EDA: Overview of Suicide Rate in a Particular Year

Author

Michael

Published

March 3, 2023

Modified

March 4, 2023

Step-by-Step Data Preparation

1. Installing and launching required R packages

pacman::p_load("patchwork", "tmap", "ExPanDaR", "kableExtra", "ggstatsplot", "plotly", "DT", "tidyverse")

2. Loading the data

suicidedata_eda_formap <- read_csv("data/suicidedata_eda_formap.csv", show_col_types = FALSE)
suicidedata_eda <- read_csv("data/suicidedata_eda.csv", show_col_types = FALSE)

3. Dataset overview

We will check the suicidedata_eda as suicidedata_eda_formap is meant to be combined with the sf file (world)

3.1 Missing Values

missing.values <- suicidedata_eda %>%
    gather(key = "key", value = "val") %>%
    mutate(is.missing = is.na(val)) %>%
    group_by(key, is.missing) %>%
    summarise(num.missing = n()) %>%
    filter(is.missing==T) %>%
    select(-is.missing) %>%
    arrange(desc(num.missing)) 
missing.values %>% kable()
key num.missing
DN 18360
DR 18360
POP 18360
SN 18360
SP 18360

Visualising Missing Values

missing_values <- function(vari = "age_name"){
prepare_missing_values_graph(suicidedata_eda, ts_id = vari)
}

By country

missing_values("country")

By gender

missing_values("sex_name")

By age group

missing_values("age_name")

By year

missing_values("year")

Conclusion : The missing values are because of age-standardised (only for Suicide Rates)

3.2 Descriptive statistics

descr_stats <- function(gender = "Both", age = "All ages") {
  
  descr <- prepare_descriptive_table(suicidedata_eda %>%
                                       filter(sex_name == gender,
                                              age_name == age) %>%
                                       select(!c(6,8)) %>%
                                       rename("Number of suicide" = "SN",
                                              "Number of deaths" = "DN",
                                              "Share of deaths from suicide (%)" = "SP",
                                              "Suicide rate" = "SR",
                                              "Mortality rate" = "DR"))
descr$kable_ret  %>%
  kable_styling("condensed", full_width = F, position = "center")
}
descr_stats("Both", "All ages")
Descriptive Statistics
N Mean Std. dev. Min. 25 % Median 75 % Max.
Number of suicide 6,120 3,874.828 18,425.617 0.126 93.934 532.835 1,882.305 220,356.720
Number of deaths 6,120 252,265.821 920,824.636 10.886 7,358.955 51,487.069 169,570.300 10,659,491.040
Share of deaths from suicide (%) 6,120 1.394 1.067 0.105 0.691 1.118 1.787 11.812
Suicide rate 6,120 11.339 9.395 1.427 5.424 8.224 14.601 95.565
Mortality rate 6,120 847.052 354.531 148.258 611.956 783.986 1,019.318 10,705.808

3.3 Distribution

3.3.1 Histogram of multiple variables
plot_multi_distribution <- function(gender = "Both", age = "All ages") {
  
suicidedata_hist <- suicidedata_eda %>% 
  filter(sex_name == gender,
         age_name == age) %>%
  select(c(7,9,10,11,12)) %>%
  rename("Number of suicide" = "SN",
         "Number of deaths" = "DN",
         "Share of deaths from suicide (%)" = "SP",
         "Suicide rate" = "SR",
         "Mortality rate" = "DR")
  
plot <- suicidedata_hist %>% 
  gather() %>%
  ggplot(aes(value)) +
  facet_wrap( ~key, ncol=3, scales="free") +
  geom_histogram()

ggplotly(plot)
}
plot_multi_distribution("Both", "All ages")

It does not make sense to look at numbers due to different population sizes. Hence we will only look at Suicide Rate, Mortality Rate and Share of deaths from suicide (%) from now on

3.3.2 Histogram of individual variables
plot_distribution <- function(gender = "Both", age = "All ages", vari = SR, title = "Suicide Rate", binwidth = 2) {
  
suicidedata_hist <- suicidedata_eda %>% 
  filter(sex_name == gender,
         age_name == age)

suicidedata_summary <- suicidedata_hist %>%
  summarise(mean = round(mean({{vari}})),
            median = round(median({{vari}})),
            ymax = as.numeric(round((IQR({{vari}})*1.5) + quantile({{vari}},0.75))),
            ymin = as.integer(min({{vari}})),
            rangemax = as.integer(max({{vari}}))) 

#computing summary statistics of mean, median and lower and upper whiskers in boxplot
var_mean <- suicidedata_summary$mean
var_median <- suicidedata_summary$median
ymax <- suicidedata_summary$ymax
ymin <- suicidedata_summary$ymin
rangemax <- suicidedata_summary$rangemax

#plotting histogram
h1 <- ggplot(suicidedata_hist, aes(x = {{vari}})) + 
  geom_histogram(color="black", fill="azure4", binwidth = binwidth)  
  
h <- h1 + scale_x_continuous(limits = c(0,rangemax), labels = scales::comma) +
  labs(x = paste0(title,", ",gender,", ",age), y = "Count") +
  geom_vline(aes(xintercept = var_mean), col="darkblue", linewidth=1, linetype = "dashed") +
  annotate("text", x=var_mean*1.05, y=max(ggplot_build(h1)$data[[1]]$count)*1.2, label="Mean:", 
           size=4, color="darkblue", hjust = 0) +
  annotate("text", x=var_mean*1.05, y=max(ggplot_build(h1)$data[[1]]$count)*1.1, label=format(var_mean, big.mark = ","),
           size=4, color="darkblue", hjust = 0) +
  geom_vline(aes(xintercept = var_median), col="lightpink4", linewidth=1, linetype = "dashed") +
  annotate("text", x=var_median*0.95, y=max(ggplot_build(h1)$data[[1]]$count)*1.2, label="Median:", 
           size=4, color="lightpink4", hjust = 1) +
  annotate("text", x=var_median*0.95, y=max(ggplot_build(h1)$data[[1]]$count)*1.1, label=format(var_median, big.mark = ","),
           size=4, color="lightpink4", hjust = 1) +
  theme(axis.text.x = element_text(size=8))

#plotting boxplot
b <- ggplot(suicidedata_hist, aes(y = {{vari}})) + 
  geom_boxplot(outlier.colour="firebrick", outlier.shape=16,
               outlier.size=1, notch=FALSE) + 
  coord_flip() + labs(y = "", x = "") + 
  scale_y_continuous(limits = c(0,rangemax), labels = scales::comma) +
  theme(axis.text = element_blank(), axis.ticks = element_blank()) + 
  stat_boxplot(geom="errorbar", width=0.5) +
  annotate("text", x=0.35, y=ymax, label=format(ymax, big.mark = ","), 
           size=3, color="lightpink4") +
  annotate("text", x=0.35, y=ymin, label=format(ymin, big.mark = ","), 
           size=3, color="lightpink4")

#combining plots
suicide_distri <- b / h + plot_layout(heights = c(1, 4)) 

suicide_distri
}
plot_distribution("Both", "All ages", SR, "Suicide Rate", 2)

3.4 Outliers

outliers_table <- function(gender = "Both", age = "All ages", vari = "Suicide rate") {
  
  outliers <- prepare_ext_obs_table(suicidedata_eda %>%
                                       filter(sex_name == gender,
                                              age_name == age) %>%
                                      select(!c(2,3,4,5)) %>%
                                      rename("Share of deaths from suicide (%)" = "SP",
                                             "Suicide rate" = "SR",
                                             "Mortality rate" = "DR"),
                                    n = 10,
                                    cs_id = "country",
                                    ts_id = "year",
                                    var = vari)
    
outliers$kable_ret  %>%
  kable_styling("condensed", full_width = F, position = "center")
}
outliers_table("Both", "All ages", "Suicide rate")
country year Suicide rate
Greenland 1,993 95.565
Greenland 1,994 94.916
Greenland 1,995 94.541
Greenland 1,990 93.745
Greenland 1,996 93.518
Greenland 1,992 93.423
Greenland 1,991 93.390
Greenland 1,997 92.778
Greenland 1,998 90.744
Greenland 1,999 88.223
... ... ...
Jamaica 1,993 1.586
Sao Tome and Principe 1,994 1.527
Jamaica 1,992 1.526
Sao Tome and Principe 1,995 1.514
Sao Tome and Principe 1,993 1.514
Sao Tome and Principe 1,992 1.467
Sao Tome and Principe 1,991 1.446
Jamaica 1,990 1.439
Jamaica 1,991 1.431
Sao Tome and Principe 1,990 1.427

More outliers on the high-end as compared to the other side with Greenland dominating the outliers. There might be interesting pattern here, hence windsorising the data using treat_outliers() function might not be recommended

4. Comparing between countries and regions within a particular year

4.1 Plotting the choropleth map

4.1.1 Joining with world map (tmap object)

The object World is a spatial object of class sf from the sf package; it is a data.frame with a special column that contains a geometry for each row, in this case polygons

Reference - https://cran.r-project.org/web/packages/tmap/vignettes/tmap-getstarted.html

data("World")

Joining the two dataframes together

suicidedata_eda_map <- left_join(World, 
                                 suicidedata_eda_formap %>% mutate(across(where(is.numeric), round, 2)),
                          by = c("iso_a3" = "code")) %>%
  select(!c(2,3,4,6,7,8,9,10,11,12,13,14,15)) %>%
  mutate(area = as.numeric(str_remove(`area`, 
                                " \\[km\\^2\\]")), 
         .after = region) %>%
  na.omit()
4.1.2 Creating function to plot choropleth map
plot_map_eda <- function(year, metric = "SR", gender = "T", style = "jenks"){
  
metric_text = case_when(metric == "SR" ~ "Suicide rate",
                        metric == "SP" ~ "Share of deaths from suicide (%)",
                        metric == "SN" ~ "Number of suicide",
                        metric == "DN" ~ "Number of deaths",
                        metric == "DR" ~ "Mortality rate")

age = case_when(metric == "SR" ~ "AS",
                metric == "SP" ~ "All",
                metric == "SN" ~ "All",
                metric == "DN" ~ "All",
                metric == "DR" ~ "All")

gender_text = case_when(gender == "T" ~ "Total",
                        gender == "M" ~ "Male",
                        gender == "F" ~ "Female")
  
tmap_mode("view")

tm_shape(suicidedata_eda_map |> 
           filter(year == year))+
  tm_fill(paste0(metric,"_",age,"_",gender), 
          style = style, 
          palette="YlOrBr", 
          id = "country",
          title = paste0(metric_text, ", ",gender_text,", ", year),
          popup.vars = c(value = paste0(metric,"_",age,"_",gender))) +
  
  tm_borders(col = "grey20",
             alpha = 0.5) 
}
plot_map_eda(2016, "SR", "M", "jenks")
tmap_mode("plot")

4.2 Statistical testing on difference in suicide metrics between Regions

Note that it does not make sense to compare the statistical testing between countries within a particular year as there is only one metric per country. As such, statistical test will be done for a time period for a country (in EDA Time Series segment)

4.2.1.1 Normality assumption

Plotting histogram by regions

plot_multi_distribution_region <- function(year, gender = "Both", age = "All ages", vari = SR) {
  
suicidedata_hist <- suicidedata_eda %>% 
  filter(year == year,
         sex_name == gender,
         age_name == age) %>%
  select(c(3,7,8,9,10,11,12))
  
plot <- suicidedata_hist %>% 
  ggplot(aes(x = {{vari}})) +
  facet_wrap( ~region, ncol=4, scales="free") +
  geom_histogram()

ggplotly(plot)
}
plot_multi_distribution_region(2010, "Both", "All ages", SR)

Anderson-Darling Test by Region

adtest_region <- function(year, gender = "Both", age = "All ages", vari = "SR") {

suicidedata_ad <- suicidedata_eda %>% 
  filter(year == year,
         sex_name == gender,
         age_name == age) %>%
  select(c(3,7,8,9,10,11,12))
    
normaltestlist <- list()
for (i in unique(suicidedata_ad$region)){
  subdf <- subset(x = suicidedata_ad, subset=region==i)
  normaltestlist[[i]] <- nortest::ad.test(subdf[[vari]])
}

normaltest <- tibble(town = unique(suicidedata_ad$region),
                     p_value = unlist(lapply(normaltestlist, `[[`, 2))) 

names(normaltest)[2] <- paste0(vari, " p-value")

DT::datatable(normaltest, class= "compact")
}
adtest_region(2010, "Both", "All ages", "SR")
4.2.1.2 ANOVA Test
plot_ANOVA_region <- function(year, gender = "Both", age = "All ages", vari = SR, title = "Suicide rate", type = "np", conf = 0.95) {

text <- case_when(type == "p" ~ "Mean",
                  type == "np" ~ "Median" )
    
suicidedata_ANOVA <- suicidedata_eda %>% 
  filter(year == year,
         sex_name == gender,
         age_name == age) %>%
  select(c(3,7,8,9,10,11,12))

suicidedata_ANOVA %>% ggbetweenstats(x = region, y = {{vari}},
               xlab = "Region", ylab = title,
               type = type, pairwise.comparisons = T, pairwise.display = "ns", 
               mean.ci = T, p.adjust.method = "fdr",  conf.level = conf,
               title = paste0("Comparison of ",text," ",title, " (", gender, " gender, ", age, ") across Regions in ", year),
               subtitle = paste0("One-way ANOVA at ", conf*100, "% confidence level"),
               package = "ggthemes", palette = "Tableau_10")

}
plot_ANOVA_region(2010, "Both", "Age-standardized", SR, type = "np")